import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_columns", None)
# pd.set_option('display.max_rows', None)
pd.set_option("display.max_rows", 200)
import statsmodels.api as sm
data = pd.read_csv('Loan_Modelling.csv') #load and read the csv file
df= data.copy() #making a copy to avoid changes to data
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")
#checking the shape of the dataset
np.random.seed(85)
df.sample(10) #loading random 10 rows
There are 5000 rows and 14 columns.
| ID | Age | Experience | Income | ZIPCode | Family | CCAvg | Education | Mortgage | Personal_Loan | Securities_Account | CD_Account | Online | CreditCard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3411 | 3412 | 63 | 37 | 118 | 94010 | 1 | 2.0 | 1 | 427 | 0 | 0 | 0 | 0 | 0 |
| 4241 | 4242 | 34 | 9 | 40 | 95054 | 4 | 2.0 | 2 | 0 | 0 | 0 | 0 | 1 | 1 |
| 2799 | 2800 | 64 | 39 | 85 | 94720 | 4 | 3.4 | 2 | 200 | 0 | 0 | 0 | 1 | 0 |
| 4132 | 4133 | 61 | 36 | 133 | 90266 | 1 | 2.6 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3220 | 3221 | 61 | 35 | 28 | 93302 | 2 | 0.2 | 3 | 135 | 0 | 0 | 0 | 1 | 0 |
| 1614 | 1615 | 47 | 23 | 89 | 94920 | 1 | 2.6 | 2 | 0 | 0 | 1 | 1 | 1 | 1 |
| 4992 | 4993 | 30 | 5 | 13 | 90037 | 4 | 0.5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3375 | 3376 | 43 | 18 | 88 | 90089 | 4 | 1.1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4023 | 4024 | 51 | 25 | 175 | 90089 | 3 | 0.7 | 1 | 312 | 1 | 0 | 0 | 0 | 0 |
| 793 | 794 | 24 | -2 | 150 | 94720 | 2 | 2.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
df.info() # looking at the structure of the data
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 5000 non-null int64 1 Age 5000 non-null int64 2 Experience 5000 non-null int64 3 Income 5000 non-null int64 4 ZIPCode 5000 non-null int64 5 Family 5000 non-null int64 6 CCAvg 5000 non-null float64 7 Education 5000 non-null int64 8 Mortgage 5000 non-null int64 9 Personal_Loan 5000 non-null int64 10 Securities_Account 5000 non-null int64 11 CD_Account 5000 non-null int64 12 Online 5000 non-null int64 13 CreditCard 5000 non-null int64 dtypes: float64(1), int64(13) memory usage: 547.0 KB
df.Personal_Loan.unique()
array([0, 1], dtype=int64)
0 & 1. As it is a binary class variable, we will convert it to category for further processing.
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=True)
""" The uszipcode is a programmable zipcode database in python.
The SearchEngine feature will help find the postal details of the Zipcode."""
# Look up the county for each customer's ZIP code and attach it as a new column.
# Vectorized with Series.apply instead of an index loop over range(len(df)),
# which used chained indexing (df.ZIPCode[i]) and a manual accumulator list.
# NOTE(review): zipcodes missing from the uszipcode DB yield county=None here.
df['County'] = df['ZIPCode'].apply(lambda z: search.by_zipcode(z).county)
print(df.County.unique()) #Finding the unique county names
print('\nTotal unique Counties :',df.County.nunique())
print('\n Total Zipcodes with missing counties:',df.County.isna().sum())
['Los Angeles County' 'Alameda County' 'San Francisco County' 'San Diego County' 'Monterey County' 'Ventura County' 'Santa Barbara County' 'Marin County' 'Santa Clara County' 'Santa Cruz County' 'San Mateo County' 'Humboldt County' 'Contra Costa County' 'Orange County' 'Sacramento County' 'Yolo County' 'Placer County' 'San Bernardino County' 'San Luis Obispo County' 'Riverside County' 'Kern County' None 'Fresno County' 'Sonoma County' 'El Dorado County' 'San Benito County' 'Butte County' 'Solano County' 'Mendocino County' 'San Joaquin County' 'Imperial County' 'Siskiyou County' 'Merced County' 'Trinity County' 'Stanislaus County' 'Shasta County' 'Tuolumne County' 'Napa County' 'Lake County'] Total unique Counties : 38 Total Zipcodes with missing counties: 34
df[df.County.isna()]['ZIPCode'].value_counts() #unique zipcodes withe missing details
92717 22 96651 6 92634 5 93077 1 Name: ZIPCode, dtype: int64
# Backfill the four ZIP codes the uszipcode database could not resolve.
# The original per-row loop wrote via chained assignment (df.County[i] = ...),
# which pandas does not guarantee to write back to df (SettingWithCopyWarning);
# .loc with a boolean mask is the supported way to assign.
zip_to_county = {
    92717: 'Orange County',         # UC Irvine ZIP, retired
    93077: 'Ventura County',
    92634: 'Orange County',
    96651: 'San Francisco County',
}
for zipcode, county_name in zip_to_county.items():
    df.loc[df.ZIPCode == zipcode, 'County'] = county_name
#Grouping the counties in 10 regions for better data analysis
Superior_California = ['Butte County',"El Dorado County","Placer County","Sacramento County","Shasta County","Siskiyou County","Yolo County"]
North_Coast = ["Humboldt County","Lake County","Mendocino County","Napa County","Sonoma County","Trinity County"]
San_Francisco_BayArea = ["Alameda County","Contra Costa County","Marin County","San Francisco County","San Mateo County","Santa Clara County","Solano County"]
Northern_SanJoaquin_Valley = ["Merced County","San Joaquin County","Tuolumne County","Stanislaus County"]
Central_Coast =["Monterey County","Ventura County","San Benito County","San Luis Obispo County","Santa Barbara County","Santa Cruz County"]
Southern_SanJoaquin_Valley=['Fresno County',"Kern County"]
Inland_Empire = ["San Bernardino County","Riverside County"]
Los_Angeles = ['Los Angeles County']
Orange = ["Orange County"]
San_Diego = ["Imperial County","San Diego County"]

def regions(x):
    """Map a county name to its California region label.

    Unknown values (including None) pass through unchanged, matching the
    original elif-chain's final `return x` fallback.
    """
    region_map = {
        "Superior_California": Superior_California,
        "North_Coast": North_Coast,
        "San_Francisco_BayArea": San_Francisco_BayArea,
        "Northern_SanJoaquin_Valley": Northern_SanJoaquin_Valley,
        "Central_Coast": Central_Coast,
        "Southern_SanJoaquin_Valley": Southern_SanJoaquin_Valley,
        "Inland_Empire": Inland_Empire,
        "Los_Angeles": Los_Angeles,
        "Orange": Orange,
        "San_Diego": San_Diego,
    }
    # dicts preserve insertion order, so this checks the groups in the
    # same order as the original if/elif chain
    for region_name, counties in region_map.items():
        if x in counties:
            return region_name
    return x
df['Region'] = df['County'].apply(regions) #creating a new column regions
df.drop(['ID','ZIPCode','County'],axis=1,inplace=True)
#Dropping ID as its not relevant and Dropping ZIPCode and County because we will use Regions in further analysis
# Convert every discrete/binary column to 'category' dtype in one pass
# instead of eight repeated astype lines.
cat_cols = ['Education', 'Family', 'Personal_Loan', 'Securities_Account',
            'CD_Account', 'Online', 'CreditCard', 'Region']
df[cat_cols] = df[cat_cols].astype('category')
df.info() #rechecking the datatypes
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 5000 non-null int64 1 Experience 5000 non-null int64 2 Income 5000 non-null int64 3 Family 5000 non-null category 4 CCAvg 5000 non-null float64 5 Education 5000 non-null category 6 Mortgage 5000 non-null int64 7 Personal_Loan 5000 non-null category 8 Securities_Account 5000 non-null category 9 CD_Account 5000 non-null category 10 Online 5000 non-null category 11 CreditCard 5000 non-null category 12 Region 5000 non-null category dtypes: category(8), float64(1), int64(4) memory usage: 235.6 KB
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 5000.0 | 45.338400 | 11.463166 | 23.0 | 35.0 | 45.0 | 55.0 | 67.0 |
| Experience | 5000.0 | 20.104600 | 11.467954 | -3.0 | 10.0 | 20.0 | 30.0 | 43.0 |
| Income | 5000.0 | 73.774200 | 46.033729 | 8.0 | 39.0 | 64.0 | 98.0 | 224.0 |
| CCAvg | 5000.0 | 1.937938 | 1.747659 | 0.0 | 0.7 | 1.5 | 2.5 | 10.0 |
| Mortgage | 5000.0 | 56.498800 | 101.713802 | 0.0 | 0.0 | 0.0 | 101.0 | 635.0 |
df[df['Experience'] < 0]['Experience'].count() #finding columns with -ve experience values
52
df1=df[(df.Experience<0)]  # rows with (invalid) negative Experience
print(f"The unique Negative Experience Array= {df1['Experience'].unique()}")
df1['Age'].value_counts(ascending=True) # how many negative-Experience rows fall at each Age
The unique Negative Experience Array= [-1 -2 -3]
26 1 28 1 29 3 23 12 24 17 25 18 Name: Age, dtype: int64
#Let's check the actual experience distribution for the ages above
# Combine both conditions into a single boolean mask: the original chained
# form df[cond1][cond2] indexes with a boolean Series from a different frame,
# which pandas flags with a reindexing UserWarning.
df2 = df[(df.Experience >= 0) & (df.Age < 30)]  # ages with -ve experience were all < 30
df2.groupby(['Age']).agg([np.mean, np.median]).Experience
| mean | median | |
|---|---|---|
| Age | ||
| 24 | 0.000000 | 0 |
| 25 | 0.514286 | 1 |
| 26 | 0.987013 | 1 |
| 27 | 1.923077 | 2 |
| 28 | 3.009804 | 3 |
| 29 | 3.833333 | 4 |
# Negative Experience values (-1, -2, -3) are treated as sign-entry errors,
# consistent with the real experience distribution for those ages above.
# Vectorized abs() replaces the per-row loop: df.Experience[i] = ... is
# chained assignment and is not guaranteed to write back to df.
df['Experience'] = df['Experience'].abs()
(df.Experience<0).value_counts()  # sanity check: expect only False
False 5000 Name: Experience, dtype: int64
df.isna().sum()
Age 0 Experience 0 Income 0 Family 0 CCAvg 0 Education 0 Mortgage 0 Personal_Loan 0 Securities_Account 0 CD_Account 0 Online 0 CreditCard 0 Region 0 dtype: int64
#Performing Univariate Analysis to study the central tendency and dispersion
#Plotting histogram to study distribution
Uni_num = df.select_dtypes(include=np.number).columns.tolist()
plt.figure(figsize=(17,75))
# enumerate replaces the range(len(...)) index loop
for pos, col in enumerate(Uni_num, start=1):
    plt.subplot(18,3,pos)
    sns.histplot(df[col], kde=False)
    plt.tight_layout()
    plt.title(col, fontsize=25)
plt.show()
# Box plots of the same columns to expose outliers and skew
plt.figure(figsize=(15,35))
for pos, col in enumerate(Uni_num, start=1):
    plt.subplot(10,3,pos)
    sns.boxplot(df[col], showmeans=True, color='yellow')
    plt.tight_layout()
    plt.title(col, fontsize=25)
plt.show()
Observations:
Income:
Credit Card Average:
Mortgage
df3= df[(df.Mortgage>0)]  # restrict to customers who actually hold a mortgage
# Box plot (top) and histogram (bottom) of Mortgage sharing one x-axis
fig,(ax_box,ax_hist) = plt.subplots(2,1,sharex=True ,
                                    figsize=(10,9),
                                    gridspec_kw = {"height_ratios": (.35, .65)})
sns.boxplot(df3.Mortgage, ax=ax_box, showmeans=True, color='orange')
sns.histplot(df3.Mortgage, ax=ax_hist,kde=True)
<AxesSubplot:xlabel='Mortgage', ylabel='Count'>
Insights:
categorical_val = df.select_dtypes(exclude=np.number).columns.tolist()
plt.figure(figsize=(15,75))
# Denominator is the row count; hoisted out of the loop since len(df[col])
# equals len(df) for every column.
total = len(df)
for pos, col in enumerate(categorical_val, start=1):
    plt.subplot(18,3,pos)
    ax = sns.countplot(df[col], palette='Spectral')
    plt.tight_layout()
    plt.title(col, fontsize=25)
    # Annotate each bar with its share of the total
    for p in ax.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        x = p.get_x() + (p.get_width() / 2)-0.1  # bar centre, nudged left
        y = p.get_y() + p.get_height()           # bar top
        ax.annotate(percentage, (x, y), size = 12.5, color='black')
    plt.xticks(rotation=90)
plt.show()
Observations:
# Correlation heat map of the numeric columns
corr= df.corr()
plt.figure(figsize=(10,7))
sns.heatmap(corr,annot= True,vmin=0,vmax=1, cmap='RdYlGn_r',linewidths=0.75)
plt.show()
Observations:
sns.pairplot(data=df,hue='Personal_Loan')
<seaborn.axisgrid.PairGrid at 0x202980c68e0>
Observations:
# For all numerical variables with Personal_Loan
plt.figure(figsize=(20,10))
for plot_pos, col in enumerate(Uni_num, start=1):
    plt.subplot(3, 2, plot_pos)
    sns.boxplot(df['Personal_Loan'], df[col], palette="Dark2")
    plt.tight_layout()
    plt.title(col)
plt.show()
Observations:
Similarly, the mean value of Experience is almost equal for both categories of Personal Loan. Neither of these variables has any outliers.
Customers who have Personal Loans also have a high mean Income and Credit Card Average compared to customers who don't have a loan. Interestingly, we see several outliers at the higher end of both these variables in Class 0.
The mean value for Mortgage at both levels is 0.0 (in dollars). This is because the majority of customers don't have mortgages. However, we see that customers with higher mortgages have Personal Loans. We also see several outliers at the high end, again for customers who don't have a loan.
The plots above suggest a correlation between Income, CCAvg and Mortgage: customers with high values of these variables have taken loans. This suggests them as possible features of customers who can be targeted.
#Stacked plot of categorical variables with Personal Loans
def stacked_plot(x):
    """Print the crosstab of `x` against Personal_Loan and draw a
    row-normalized stacked bar chart.

    x : pandas Series (a column of the global df) to cross-tabulate
        against df['Personal_Loan'].
    """
    sns.set(palette='Accent')
    tab1 = pd.crosstab(x,df['Personal_Loan'],margins=True)
    print(tab1)
    print('-'*120)
    tab = pd.crosstab(x,df['Personal_Loan'],normalize='index')
    tab.plot(kind='bar',stacked=True,figsize=(10,5))
    # BUG FIX: the original called plt.legend twice; the first call
    # (loc='lower left') was dead code, immediately replaced by this one.
    plt.legend(loc="upper left", bbox_to_anchor=(1,1))
    plt.ylabel('Percentage')
    plt.show()
stacked_plot(df.Family)
Personal_Loan 0 1 All Family 1 1365 107 1472 2 1190 106 1296 3 877 133 1010 4 1088 134 1222 All 4520 480 5000 ------------------------------------------------------------------------------------------------------------------------
stacked_plot(df.Education)
Personal_Loan 0 1 All Education 1 2003 93 2096 2 1221 182 1403 3 1296 205 1501 All 4520 480 5000 ------------------------------------------------------------------------------------------------------------------------
stacked_plot(df.Securities_Account)
Personal_Loan 0 1 All Securities_Account 0 4058 420 4478 1 462 60 522 All 4520 480 5000 ------------------------------------------------------------------------------------------------------------------------
stacked_plot(df.CD_Account)
Personal_Loan 0 1 All CD_Account 0 4358 340 4698 1 162 140 302 All 4520 480 5000 ------------------------------------------------------------------------------------------------------------------------
stacked_plot(df.Online)
Personal_Loan 0 1 All Online 0 1827 189 2016 1 2693 291 2984 All 4520 480 5000 ------------------------------------------------------------------------------------------------------------------------
stacked_plot(df.CreditCard)
Personal_Loan 0 1 All CreditCard 0 3193 337 3530 1 1327 143 1470 All 4520 480 5000 ------------------------------------------------------------------------------------------------------------------------
#Income Vs Education Vs Personal_Loan
plt.figure(figsize=(15,7))
sns.boxplot(data=df,y='Income',x='Education',hue='Personal_Loan')
plt.show()
# Income Vs Family Vs Personal_Loan
plt.figure(figsize=(15,7))
sns.boxplot(data=df,y='Income',x='Family',hue='Personal_Loan')
plt.show()
# Mortgage Vs Family Vs Personal_Loan
plt.figure(figsize=(15,7))
sns.boxplot(data=df,y='Mortgage',x='Family',hue='Personal_Loan')
plt.show()
# CCAvg Vs CreditCard Vs Personal_Loan
plt.figure(figsize=(15,7))
sns.boxplot(data=df,y='CCAvg',x='CreditCard',hue='Personal_Loan')
plt.show()
# Income Vs CCAvg scatter, colored by Personal_Loan
plt.figure(figsize=(15,7))
sns.scatterplot(data=df,y='Income',x='CCAvg',hue='Personal_Loan')
plt.show()
Observations:
>2.5 (in thousand dollars) have personal loans.
df1=df.copy() # new copy for Decision Tree model
# Lets treat outliers by flooring and capping
def treat_outliers(df, col):
    """Cap column `col` of `df` to the Tukey whiskers, in place.

    Values below Q1 - 1.5*IQR are floored to that bound; values above
    Q3 + 1.5*IQR are capped to it. Returns the (mutated) frame.
    """
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    df[col] = np.clip(df[col], lower, upper)
    return df

def treat_outliers_all(df, col_list):
    """Apply treat_outliers to every column named in `col_list`."""
    for c in col_list:
        df = treat_outliers(df, c)
    return df
no_treatment = {'Age','Experience'} # these two variables don't have outliers, so skip them
numerical_col = [ele for ele in Uni_num if ele not in no_treatment]
#Applying outlier treatment to the remaining numeric columns (mutates df)
df = treat_outliers_all(df,numerical_col)
#Defining a function for Confusion matrix
from sklearn.metrics import classification_report,confusion_matrix
sns.set(font_scale=2.0) # to set font size for the matrix
def make_confusion_matrix(y_actual, y_predict):
    """Plot an annotated 2x2 confusion-matrix heat map.

    y_predict: prediction of class
    y_actual : ground truth
    Each cell is labeled with its name, raw count, and share of all samples.
    """
    cm = confusion_matrix(y_actual, y_predict)
    cell_names = ['True -ve', 'False +ve', 'False -ve', 'True +ve']
    counts = cm.flatten()
    total = np.sum(cm)
    labels = np.asarray(
        [f"{name}\n{count:0.0f}\n{count / total:.2%}"
         for name, count in zip(cell_names, counts)]
    ).reshape(2, 2)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=labels, fmt='', cmap='Blues')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
#Importing all necessary libraries
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import statsmodels.api as sm
from sklearn import metrics #accuracy,confusion metrics, etc
from sklearn import datasets
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
## Defining X and Y variables
X = df.drop(['Personal_Loan'], axis=1) #dropping the dependent variable
Y = df[['Personal_Loan']]
#Convert categorical variables to dummy variables
# drop_first=True removes one level per category to avoid the dummy-variable trap
X = pd.get_dummies(X, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.30,random_state=29) # 70% train set and 30% test set
# There are several optimizers; we use 'saga' with max_iter=1000
# (max_iter caps the number of iterations allowed for convergence).
# NOTE(review): penalty='none' is spelled penalty=None in scikit-learn >= 1.2 — confirm the pinned version.
logreg = LogisticRegression(solver='saga',max_iter=1000,penalty='none',verbose=True,n_jobs=1,random_state=29)
# .values.ravel() passes y as a 1-D array, avoiding sklearn's
# DataConversionWarning for single-column DataFrames.
logreg.fit(X_train, y_train.values.ravel())
pred_train = logreg.predict(X_train)
pred_test = logreg.predict(X_test)
#Checking the Accuracy of the model (both splits, to compare for overfitting):
print('\nAccuracy on train data:%.6f'%accuracy_score(y_train, pred_train) )
print('Accuracy on test data:%.6f' %accuracy_score(y_test, pred_test))
#checking the Recall metric of the model:
print('\nRecall on train data:%.6f'%recall_score(y_train, pred_train) )
print('Recall on test data:%.6f'%recall_score(y_test, pred_test))
#checking the Precision metric of the model:
print("\nPrecision on training set : ",precision_score(y_train, pred_train))
print("Precision on test set : ",precision_score(y_test, pred_test))
# F1 combines precision and recall into a single score
print("\nF1 Score on training set : ",f1_score(y_train, pred_train))
print("F1 Score on test set : ",f1_score(y_test, pred_test))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
max_iter reached after 5 seconds Accuracy on train data:0.938000 Accuracy on test data:0.938667 Recall on train data:0.524781 Recall on test data:0.459854 Precision on training set : 0.7692307692307693 Precision on test set : 0.7777777777777778 F1 Score on training set : 0.6239168110918544 F1 Score on test set : 0.5779816513761469
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 4.9s finished
make_confusion_matrix(y_test,pred_test) #display confusion matrix for test set
Observations:
# adding constant to training and test set
# (statsmodels does not add an intercept term automatically)
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
#Defining a function to print all the performance metric scores
def metrics_score(model, train, test, train_y, test_y):
    '''
    Function to calculate different metric scores of the model - Accuracy, Recall, Precision, and F1 score
    model: classifier to predict values of X
    train, test: Independent features
    train_y,test_y: Dependent variable '''
    # model.predict returns probabilities (statsmodels); round() thresholds them
    train_pred = [round(p) for p in model.predict(train)]
    test_pred = [round(p) for p in model.predict(test)]
    # One loop covers all four metrics on both splits
    for metric_name, metric in (("Accuracy", accuracy_score),
                                ("Recall", recall_score),
                                ("Precision", precision_score),
                                ("F1", f1_score)):
        print(f"{metric_name} on training set : ", metric(train_y, train_pred))
        print(f"{metric_name} on test set : ", metric(test_y, test_pred))
# Logistic regression via statsmodels, which exposes coefficient p-values
logit = sm.Logit(y_train, X_train) #logistic regression
lg = logit.fit(warn_convergence =False)
#Checking model performance
metrics_score(lg,X_train,X_test,y_train,y_test)
Optimization terminated successfully.
Current function value: 0.103672
Iterations 10
Accuracy on training set : 0.964
Accuracy on test set : 0.966
Recall on training set : 0.7288629737609329
Recall on test set : 0.6788321167883211
Precision on training set : 0.8833922261484098
Precision on test set : 0.93
F1 on training set : 0.7987220447284346
F1 on test set : 0.7848101265822783
Observations:
cm_pred = lg.predict(X_test)  # predicted probabilities from the statsmodels fit
pred_test = list(map(round,cm_pred))  # NOTE(review): round() thresholds near 0.5 (half-to-even at exactly 0.5)
make_confusion_matrix(y_test,pred_test)
#checking the VIF scores for X_train set (multicollinearity diagnostic)
vif_series1 = pd.Series([variance_inflation_factor(X_train.values,i) for i in range(X_train.shape[1])],index=X_train.columns)
print('Series before feature selection: \n\n{}\n'.format(vif_series1))
Series before feature selection: const 460.837931 Age 92.257546 Experience 92.165438 Income 1.863532 CCAvg 1.714957 Mortgage 1.025765 Family_2 1.395899 Family_3 1.384109 Family_4 1.416701 Education_2 1.303900 Education_3 1.331429 Securities_Account_1 1.129095 CD_Account_1 1.333988 Online_1 1.044979 CreditCard_1 1.116427 Region_Inland_Empire 1.280061 Region_Los_Angeles 2.519934 Region_North_Coast 1.161298 Region_Northern_SanJoaquin_Valley 1.084884 Region_Orange 1.644761 Region_San_Diego 1.964014 Region_San_Francisco_BayArea 3.001965 Region_Southern_SanJoaquin_Valley 1.161376 Region_Superior_California 1.654779 dtype: float64
Observations:
# Age and Experience are highly collinear (VIF ~92 each above); drop Experience
X_train1 = X_train.drop('Experience', axis=1)
X_test1 = X_test.drop('Experience', axis=1)
# Recompute VIF after the drop
vif_series2 = pd.Series([variance_inflation_factor(X_train1.values,i) for i in range(X_train1.shape[1])],index=X_train1.columns)
print('Series before feature selection: \n\n{}\n'.format(vif_series2))
Series before feature selection: const 37.671334 Age 1.013455 Income 1.860348 CCAvg 1.712338 Mortgage 1.025717 Family_2 1.395774 Family_3 1.377508 Family_4 1.415131 Education_2 1.292653 Education_3 1.259825 Securities_Account_1 1.128764 CD_Account_1 1.332697 Online_1 1.044978 CreditCard_1 1.116378 Region_Inland_Empire 1.278800 Region_Los_Angeles 2.519678 Region_North_Coast 1.161059 Region_Northern_SanJoaquin_Valley 1.084882 Region_Orange 1.643245 Region_San_Diego 1.963651 Region_San_Francisco_BayArea 3.001944 Region_Southern_SanJoaquin_Valley 1.160531 Region_Superior_California 1.654603 dtype: float64
# Refit the logit model without Experience and compare performance
logit1=sm.Logit(y_train,X_train1)
lg1=logit1.fit()
metrics_score(lg1,X_train1,X_test1,y_train,y_test)
Optimization terminated successfully.
Current function value: 0.103721
Iterations 10
Accuracy on training set : 0.9645714285714285
Accuracy on test set : 0.966
Recall on training set : 0.7288629737609329
Recall on test set : 0.6788321167883211
Precision on training set : 0.8896797153024911
Precision on test set : 0.93
F1 on training set : 0.8012820512820513
F1 on test set : 0.7848101265822783
Observations:
Variable Significance:
print(lg1.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Personal_Loan No. Observations: 3500
Model: Logit Df Residuals: 3477
Method: MLE Df Model: 22
Date: Tue, 01 Jun 2021 Pseudo R-squ.: 0.6765
Time: 22:38:36 Log-Likelihood: -363.02
converged: True LL-Null: -1122.3
Covariance Type: nonrobust LLR p-value: 3.072e-308
=====================================================================================================
coef std err z P>|z| [0.025 0.975]
-----------------------------------------------------------------------------------------------------
const -14.3822 0.923 -15.577 0.000 -16.192 -12.573
Age 0.0107 0.009 1.211 0.226 -0.007 0.028
Income 0.0653 0.004 16.344 0.000 0.057 0.073
CCAvg 0.4269 0.076 5.591 0.000 0.277 0.577
Mortgage 0.0016 0.001 1.616 0.106 -0.000 0.004
Family_2 -0.1526 0.289 -0.528 0.597 -0.719 0.414
Family_3 2.3026 0.319 7.227 0.000 1.678 2.927
Family_4 1.5065 0.303 4.969 0.000 0.912 2.101
Education_2 4.1499 0.348 11.933 0.000 3.468 4.831
Education_3 4.1986 0.346 12.144 0.000 3.521 4.876
Securities_Account_1 -0.3856 0.362 -1.066 0.287 -1.095 0.324
CD_Account_1 3.7991 0.428 8.873 0.000 2.960 4.638
Online_1 -0.6782 0.213 -3.185 0.001 -1.096 -0.261
CreditCard_1 -1.1213 0.282 -3.976 0.000 -1.674 -0.569
Region_Inland_Empire -0.0071 0.671 -0.011 0.992 -1.322 1.308
Region_Los_Angeles 0.0891 0.394 0.226 0.821 -0.683 0.861
Region_North_Coast 0.2064 0.952 0.217 0.828 -1.660 2.073
Region_Northern_SanJoaquin_Valley -1.7951 1.548 -1.159 0.246 -4.830 1.240
Region_Orange 0.0615 0.493 0.125 0.901 -0.904 1.027
Region_San_Diego 0.2343 0.437 0.537 0.591 -0.621 1.090
Region_San_Francisco_BayArea 0.0529 0.375 0.141 0.888 -0.683 0.789
Region_Southern_SanJoaquin_Valley 0.6988 0.833 0.839 0.402 -0.934 2.332
Region_Superior_California -0.1346 0.528 -0.255 0.799 -1.170 0.901
=====================================================================================================
Possibly complete quasi-separation: A fraction 0.16 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
Insights
#dropping all dummy variables of Region
# (every Region coefficient is statistically insignificant in lg1's summary above)
X_train2 = X_train1.drop(['Region_Inland_Empire','Region_Los_Angeles','Region_North_Coast','Region_Northern_SanJoaquin_Valley','Region_Orange','Region_San_Diego','Region_San_Francisco_BayArea','Region_Southern_SanJoaquin_Valley','Region_Superior_California'], axis=1)
X_test2 = X_test1.drop(['Region_Inland_Empire','Region_Los_Angeles','Region_North_Coast','Region_Northern_SanJoaquin_Valley','Region_Orange','Region_San_Diego','Region_San_Francisco_BayArea','Region_Southern_SanJoaquin_Valley','Region_Superior_California'], axis=1)
logit2=sm.Logit(y_train,X_train2)
lg2=logit2.fit()
#print(lg2.summary())
#Lets look at model performance
metrics_score(lg2,X_train2,X_test2,y_train,y_test)
Optimization terminated successfully.
Current function value: 0.104151
Iterations 10
Accuracy on training set : 0.9634285714285714
Accuracy on test set : 0.9666666666666667
Recall on training set : 0.7259475218658892
Recall on test set : 0.6861313868613139
Precision on training set : 0.8798586572438163
Precision on test set : 0.9306930693069307
F1 on training set : 0.7955271565495207
F1 on test set : 0.7899159663865546
#Let's drop Age (p-value 0.226 in the lg1 summary above - not significant)
X_train3 = X_train2.drop(['Age'],axis=1)
X_test3 = X_test2.drop(['Age'],axis=1)
logit3=sm.Logit(y_train,X_train3)
lg3=logit3.fit()
metrics_score(lg3,X_train3,X_test3,y_train,y_test)
Optimization terminated successfully.
Current function value: 0.104340
Iterations 10
Accuracy on training set : 0.9631428571428572
Accuracy on test set : 0.966
Recall on training set : 0.7201166180758017
Recall on test set : 0.6861313868613139
Precision on training set : 0.8821428571428571
Precision on test set : 0.9215686274509803
F1 on training set : 0.7929373996789727
F1 on test set : 0.7866108786610878
#Let's drop Mortgage (p-value 0.106 in the lg1 summary above - not significant)
X_train4 = X_train3.drop(['Mortgage'],axis=1)
X_test4 = X_test3.drop(['Mortgage'],axis=1)
logit4=sm.Logit(y_train,X_train4)
lg4=logit4.fit()
metrics_score(lg4,X_train4,X_test4,y_train,y_test)
Optimization terminated successfully.
Current function value: 0.104704
Iterations 10
Accuracy on training set : 0.9637142857142857
Accuracy on test set : 0.9666666666666667
Recall on training set : 0.717201166180758
Recall on test set : 0.6934306569343066
Precision on training set : 0.8913043478260869
Precision on test set : 0.9223300970873787
F1 on training set : 0.7948303715670436
F1 on test set : 0.7916666666666667
print(lg4.summary())
Logit Regression Results
==============================================================================
Dep. Variable: Personal_Loan No. Observations: 3500
Model: Logit Df Residuals: 3488
Method: MLE Df Model: 11
Date: Tue, 01 Jun 2021 Pseudo R-squ.: 0.6735
Time: 22:38:36 Log-Likelihood: -366.46
converged: True LL-Null: -1122.3
Covariance Type: nonrobust LLR p-value: 0.000
========================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------
const -13.6386 0.735 -18.561 0.000 -15.079 -12.198
Income 0.0651 0.004 16.400 0.000 0.057 0.073
CCAvg 0.4130 0.075 5.496 0.000 0.266 0.560
Family_2 -0.1298 0.287 -0.452 0.651 -0.692 0.433
Family_3 2.3261 0.317 7.331 0.000 1.704 2.948
Family_4 1.5408 0.302 5.102 0.000 0.949 2.133
Education_2 4.0866 0.341 11.984 0.000 3.418 4.755
Education_3 4.1152 0.339 12.135 0.000 3.451 4.780
Securities_Account_1 -0.4244 0.359 -1.183 0.237 -1.127 0.279
CD_Account_1 3.7953 0.423 8.981 0.000 2.967 4.624
Online_1 -0.6549 0.211 -3.107 0.002 -1.068 -0.242
CreditCard_1 -1.1232 0.278 -4.036 0.000 -1.669 -0.578
========================================================================================
Possibly complete quasi-separation: A fraction 0.15 of observations can be
perfectly predicted. This might indicate that there is complete
quasi-separation. In this case some parameters will not be identified.
Hence, we will use lg4 as the final model
Odds ratio = Exp(coef)
Probability = odds/(1+odds)
#Calculate Odds Ratio, probability
##create a data frame to collate Odds ratio, probability and p-value of the coef
lgcoef = pd.DataFrame(lg4.params, columns=['coef']) #getting the coefficients from the final lg4 model
lgcoef.loc[:, "Odds_ratio"] = np.exp(lgcoef.coef) #odds ratio = exp(coef)
lgcoef['probability'] = lgcoef['Odds_ratio']/(1+lgcoef['Odds_ratio']) #probability = odds/(1+odds)
# BUG FIX: p-values must come from lg4 (the model the coefficients above came
# from), not lg, the full model fitted earlier with different features.
lgcoef['pval'] = lg4.pvalues
pd.options.display.float_format = '{:.2f}'.format
# Filter by significant p-value (pval <= 0.005) and sort descending by Odds ratio
lgcoef = lgcoef.sort_values(by="Odds_ratio", ascending=False)
pval_filter = lgcoef['pval']<=0.005
lgcoef[pval_filter]
| coef | Odds_ratio | probability | pval | |
|---|---|---|---|---|
| Education_3 | 4.12 | 61.26 | 0.98 | 0.00 |
| Education_2 | 4.09 | 59.54 | 0.98 | 0.00 |
| CD_Account_1 | 3.80 | 44.49 | 0.98 | 0.00 |
| Family_3 | 2.33 | 10.24 | 0.91 | 0.00 |
| Family_4 | 1.54 | 4.67 | 0.82 | 0.00 |
| CCAvg | 0.41 | 1.51 | 0.60 | 0.00 |
| Income | 0.07 | 1.07 | 0.52 | 0.00 |
| Online_1 | -0.65 | 0.52 | 0.34 | 0.00 |
| CreditCard_1 | -1.12 | 0.33 | 0.25 | 0.00 |
| const | -13.64 | 0.00 | 0.00 | 0.00 |
Observations:
Confusion matrix Prediction on lg4 model Test Data
# Predicted probabilities from the final lg4 model, rounded to class labels
pred1 = lg4.predict(X_test4)
pred_test1 = list(map(round,pred1))
make_confusion_matrix(y_test,pred_test1)
Observations:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Predict once and reuse: the same probability vector feeds both the AUC
# score and the ROC curve (the original called lg4.predict twice).
test_probs = lg4.predict(X_test4)
logit_roc_auc = roc_auc_score(y_test, test_probs)
fpr, tpr, thresholds = roc_curve(y_test, test_probs)
plt.figure(figsize=(13,8))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal = random-classifier baseline
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
Optimal Threshold from AUC-ROC
# Optimal threshold as per AUC-ROC curve
optimal_idx = np.argmax(tpr - fpr)  # maximizes TPR - FPR (Youden's J statistic)
optimal = thresholds[optimal_idx]
print(optimal)
0.2016926863819639
#Applying the optimal threshold to predict model for test data
# probability > threshold -> class 1; astype(int) converts the boolean mask
y_pred_train = (lg4.predict(X_train4)>optimal).astype(int)
y_pred_test = (lg4.predict(X_test4)>optimal).astype(int)
#Confusion matrix for test set for lg4 model
make_confusion_matrix(y_test,y_pred_test)
print("Accuracy on training set : ",accuracy_score(y_train,y_pred_train))
print("Accuracy on test set : ",accuracy_score(y_test,y_pred_test))
print("\nRecall on training set : ",recall_score(y_train,y_pred_train))
print("Recall on test set : ",recall_score(y_test,y_pred_test))
print("\nPrecision on training set : ",precision_score(y_train,y_pred_train))
print("Precision on test set : ",precision_score(y_test, y_pred_test))
print("\nF1 Score on training set : ",f1_score(y_train,y_pred_train))
print("F1 Score on test set : ",f1_score(y_test, y_pred_test))
Accuracy on training set : 0.9497142857142857 Accuracy on test set : 0.94 Recall on training set : 0.8483965014577259 Recall on test set : 0.8321167883211679 Precision on training set : 0.7012048192771084 Precision on test set : 0.6298342541436464 F1 Score on training set : 0.7678100263852243 F1 Score on test set : 0.7169811320754716
Observations
from sklearn.metrics import precision_recall_curve
# Scores on the training set feed the precision/recall trade-off curve.
y_PresRec = lg4.predict(X_train4)
prec, rec, tre = precision_recall_curve(y_train, y_PresRec)
def plot_prec_recall_vs_tresh(precisions, recalls, thresholds):
    """Plot precision (blue) and recall (green) against the decision threshold.

    precision_recall_curve returns len(thresholds) + 1 precision/recall
    entries, hence the [:-1] trim to align the arrays.
    """
    for series, fmt, lbl in ((precisions, 'b--', 'precision'),
                             (recalls, 'g--', 'recall')):
        plt.plot(thresholds, series[:-1], fmt, label=lbl)
    plt.xlabel('Threshold')
    plt.legend(loc='upper left')
    plt.ylim([0, 1])
# Render the precision/recall-vs-threshold diagnostic on a fresh figure.
fig = plt.figure(figsize=(10, 7))
plot_prec_recall_vs_tresh(prec, rec, tre)
plt.show()
# Apply a hand-picked threshold, read off the precision/recall-vs-threshold
# plot above, that balances recall against precision.
optimal_threshold = 0.25  # we get a balanced recall and precision at this threshold
train_scores1 = lg4.predict(X_train4)
test_scores1 = lg4.predict(X_test4)
y_pred_train1 = (train_scores1 > optimal_threshold).astype(int)
y_pred_test1 = (test_scores1 > optimal_threshold).astype(int)

# Confusion matrix for the test set at the 0.25 cutoff
make_confusion_matrix(y_test, y_pred_test1)

print("Accuracy on training set : ", accuracy_score(y_train, y_pred_train1))
print("Accuracy on test set : ", accuracy_score(y_test, y_pred_test1))
print("\nRecall on training set : ", recall_score(y_train, y_pred_train1))
print("Recall on test set : ", recall_score(y_test, y_pred_test1))
print("\nPrecision on training set : ", precision_score(y_train, y_pred_train1))
print("Precision on test set : ", precision_score(y_test, y_pred_test1))
print("\nF1 Score on training set : ", f1_score(y_train, y_pred_train1))
print("F1 Score on test set : ", f1_score(y_test, y_pred_test1))
Accuracy on training set : 0.956 Accuracy on test set : 0.9493333333333334 Recall on training set : 0.8367346938775511 Recall on test set : 0.8102189781021898 Precision on training set : 0.7454545454545455 Precision on test set : 0.6894409937888198 F1 Score on training set : 0.7884615384615385 F1 Score on test set : 0.7449664429530201
Observations
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
## Defining X and Y variables
X = df.drop(['Personal_Loan'], axis=1)
Y = df[['Personal_Loan']]
# Convert categorical variables to dummy variables
X = pd.get_dummies(X, drop_first=True)
# FIX: the original split had no random_state, so the selected features and
# every downstream score changed on each run. Pin it to 29 for
# reproducibility, matching the split used elsewhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=29)
# Fit the base logistic-regression model on the training data, then run
# sequential FORWARD selection over all 22 candidate features, scoring each
# subset by 5-fold cross-validated recall.
m = LogisticRegression(solver='newton-cg', random_state=0, n_jobs=-1)
sfs = SFS(m, k_features=22, forward=True, floating=False,
          scoring='recall', cv=5, verbose=2)
sfs = sfs.fit(X_train, y_train)

# Recall (with std-dev band) as a function of the number of selected features
fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.ylim([0, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 7.2s remaining: 0.0s [Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 10.8s finished [2021-06-01 22:38:51] Features: 1/22 -- score: 0.35393939393939394[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.7s remaining: 0.0s [Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 13.8s finished [2021-06-01 22:39:05] Features: 2/22 -- score: 0.46060606060606063[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 14.0s finished [2021-06-01 22:39:19] Features: 3/22 -- score: 0.5948717948717949[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.5s remaining: 0.0s [Parallel(n_jobs=1)]: Done 20 out of 20 | elapsed: 13.0s finished [2021-06-01 22:39:32] Features: 4/22 -- score: 0.6158508158508158[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.7s remaining: 0.0s [Parallel(n_jobs=1)]: Done 19 out of 19 | elapsed: 13.5s finished [2021-06-01 22:39:46] Features: 5/22 -- score: 0.6373426573426573[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.1s remaining: 0.0s [Parallel(n_jobs=1)]: Done 18 out of 18 | elapsed: 14.6s finished [2021-06-01 22:40:01] Features: 6/22 -- score: 0.6464801864801865[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. 
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 17 out of 17 | elapsed: 14.8s finished [2021-06-01 22:40:16] Features: 7/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.5s remaining: 0.0s [Parallel(n_jobs=1)]: Done 16 out of 16 | elapsed: 12.2s finished [2021-06-01 22:40:28] Features: 8/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.6s remaining: 0.0s [Parallel(n_jobs=1)]: Done 15 out of 15 | elapsed: 10.3s finished [2021-06-01 22:40:38] Features: 9/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 14 out of 14 | elapsed: 11.2s finished [2021-06-01 22:40:50] Features: 10/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.5s remaining: 0.0s [Parallel(n_jobs=1)]: Done 13 out of 13 | elapsed: 8.4s finished [2021-06-01 22:40:58] Features: 11/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 12 out of 12 | elapsed: 9.1s finished [2021-06-01 22:41:07] Features: 12/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.9s remaining: 0.0s [Parallel(n_jobs=1)]: Done 11 out of 11 | elapsed: 8.2s finished [2021-06-01 22:41:16] Features: 13/22 -- score: 0.6464335664335665[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. 
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.7s remaining: 0.0s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 7.2s finished [2021-06-01 22:41:23] Features: 14/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.7s remaining: 0.0s [Parallel(n_jobs=1)]: Done 9 out of 9 | elapsed: 7.1s finished [2021-06-01 22:41:30] Features: 15/22 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 8 out of 8 | elapsed: 6.0s finished [2021-06-01 22:41:36] Features: 16/22 -- score: 0.6525407925407926[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 7 out of 7 | elapsed: 5.7s finished [2021-06-01 22:41:42] Features: 17/22 -- score: 0.6525407925407926[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.1s remaining: 0.0s [Parallel(n_jobs=1)]: Done 6 out of 6 | elapsed: 5.2s finished [2021-06-01 22:41:47] Features: 18/22 -- score: 0.6465268065268066[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.1s remaining: 0.0s [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 4.4s finished [2021-06-01 22:41:52] Features: 19/22 -- score: 0.6707692307692307[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.1s remaining: 0.0s [Parallel(n_jobs=1)]: Done 4 out of 4 | elapsed: 3.7s finished [2021-06-01 22:41:56] Features: 20/22 -- score: 0.6707692307692307[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. 
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.1s remaining: 0.0s [Parallel(n_jobs=1)]: Done 3 out of 3 | elapsed: 2.9s finished [2021-06-01 22:41:59] Features: 21/22 -- score: 0.6677389277389276[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.3s remaining: 0.0s [Parallel(n_jobs=1)]: Done 2 out of 2 | elapsed: 2.6s finished [2021-06-01 22:42:01] Features: 22/22 -- score: 0.6616783216783217
# Repeat forward selection, stopping at 19 features — the subset size with
# the best CV recall in the previous run.
sfs1 = SFS(m, k_features=19, forward=True, floating=False,
           scoring='recall', cv=5, verbose=2)
sfs1 = sfs1.fit(X_train, y_train)

fig1 = plot_sfs(sfs1.get_metric_dict(), kind='std_dev')
plt.ylim([0, 1])
plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.1s remaining: 0.0s [Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 3.2s finished [2021-06-01 22:42:05] Features: 1/19 -- score: 0.35393939393939394[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.6s remaining: 0.0s [Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 12.9s finished [2021-06-01 22:42:18] Features: 2/19 -- score: 0.46060606060606063[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.5s remaining: 0.0s [Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 13.3s finished [2021-06-01 22:42:32] Features: 3/19 -- score: 0.5948717948717949[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.5s remaining: 0.0s [Parallel(n_jobs=1)]: Done 20 out of 20 | elapsed: 13.1s finished [2021-06-01 22:42:45] Features: 4/19 -- score: 0.6158508158508158[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.6s remaining: 0.0s [Parallel(n_jobs=1)]: Done 19 out of 19 | elapsed: 12.6s finished [2021-06-01 22:42:58] Features: 5/19 -- score: 0.6373426573426573[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.7s remaining: 0.0s [Parallel(n_jobs=1)]: Done 18 out of 18 | elapsed: 12.6s finished [2021-06-01 22:43:10] Features: 6/19 -- score: 0.6464801864801865[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. 
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.9s remaining: 0.0s [Parallel(n_jobs=1)]: Done 17 out of 17 | elapsed: 13.1s finished [2021-06-01 22:43:23] Features: 7/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.7s remaining: 0.0s [Parallel(n_jobs=1)]: Done 16 out of 16 | elapsed: 13.1s finished [2021-06-01 22:43:37] Features: 8/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 15 out of 15 | elapsed: 11.4s finished [2021-06-01 22:43:48] Features: 9/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 14 out of 14 | elapsed: 10.3s finished [2021-06-01 22:43:59] Features: 10/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 13 out of 13 | elapsed: 10.3s finished [2021-06-01 22:44:09] Features: 11/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 12 out of 12 | elapsed: 9.3s finished [2021-06-01 22:44:18] Features: 12/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 11 out of 11 | elapsed: 8.7s finished [2021-06-01 22:44:27] Features: 13/19 -- score: 0.6464335664335665[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. 
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.8s remaining: 0.0s [Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 9.3s finished [2021-06-01 22:44:37] Features: 14/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 0.9s remaining: 0.0s [Parallel(n_jobs=1)]: Done 9 out of 9 | elapsed: 8.8s finished [2021-06-01 22:44:45] Features: 15/19 -- score: 0.6495104895104895[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.4s remaining: 0.0s [Parallel(n_jobs=1)]: Done 8 out of 8 | elapsed: 8.2s finished [2021-06-01 22:44:54] Features: 16/19 -- score: 0.6525407925407926[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.0s remaining: 0.0s [Parallel(n_jobs=1)]: Done 7 out of 7 | elapsed: 7.0s finished [2021-06-01 22:45:01] Features: 17/19 -- score: 0.6525407925407926[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.3s remaining: 0.0s [Parallel(n_jobs=1)]: Done 6 out of 6 | elapsed: 7.0s finished [2021-06-01 22:45:08] Features: 18/19 -- score: 0.6465268065268066[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 1.4s remaining: 0.0s [Parallel(n_jobs=1)]: Done 5 out of 5 | elapsed: 6.1s finished [2021-06-01 22:45:14] Features: 19/19 -- score: 0.6707692307692307
# Which are the important features? Positional indices kept by the
# 19-feature forward selection.
feat_cols = [idx for idx in sfs1.k_feature_idx_]
print(feat_cols)
[1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22]
#Looking at the column names
# Translate the selected positional indices into readable feature names
# (last expression in the cell, so the Index object is displayed).
X_train.columns[feat_cols]
Index(['Experience', 'Income', 'CCAvg', 'Family_2', 'Family_3', 'Family_4',
'Education_2', 'Education_3', 'Securities_Account_1', 'CD_Account_1',
'CreditCard_1', 'Region_Inland_Empire', 'Region_Los_Angeles',
'Region_North_Coast', 'Region_Northern_SanJoaquin_Valley',
'Region_Orange', 'Region_San_Diego',
'Region_Southern_SanJoaquin_Valley', 'Region_Superior_California'],
dtype='object')
# Restrict the train/test matrices to the SFS-selected feature subset.
selected_cols = X_train.columns[feat_cols]
X_train_final = X_train[selected_cols]
X_test_final = X_test[selected_cols]

# Unpenalised logistic regression fitted with the saga solver.
logreg1 = LogisticRegression(solver='saga', penalty='none', max_iter=1000,
                             random_state=29, n_jobs=1, verbose=True)
logreg1.fit(X_train_final, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
max_iter reached after 4 seconds
[Parallel(n_jobs=1)]: Done 1 out of 1 | elapsed: 4.3s finished
LogisticRegression(max_iter=1000, n_jobs=1, penalty='none', random_state=29,
solver='saga', verbose=True)
#Lets check the model performance
# metrics_score is a helper defined earlier in the notebook; per its output
# below it reports accuracy/recall/precision/F1 for the train and test splits.
metrics_score(logreg1,X_train_final,X_test_final,y_train,y_test)
Accuracy on training set : 0.9442857142857143 Accuracy on test set : 0.944 Recall on training set : 0.4817073170731707 Recall on test set : 0.5131578947368421 Precision on training set : 0.8633879781420765 Precision on test set : 0.8863636363636364 F1 on training set : 0.6183953033268103 F1 on test set : 0.65
# Hard-label predictions on the reduced test set, then their confusion matrix.
pred_test2 = logreg1.predict(X_test_final)
print("confusion matrix = \n")
make_confusion_matrix(y_test, pred_test2)
confusion matrix =
# AUC-ROC curve for the SFS-selected logistic regression, computed from the
# predicted probability of the positive class.
sfs_test_probs = logreg1.predict_proba(X_test_final)[:, 1]
SFS_roc_auc = roc_auc_score(y_test, sfs_test_probs)
fpr, tpr, thresholds = roc_curve(y_test, sfs_test_probs)

fig, ax = plt.subplots(figsize=(13, 8))
ax.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % SFS_roc_auc)
ax.plot([0, 1], [0, 1], 'r--')  # chance diagonal
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic')
ax.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()

# Youden's J (tpr - fpr) is maximised at the ROC-optimal threshold.
optimal_idx1 = int(np.argmax(tpr - fpr))
optimal_threshold1 = thresholds[optimal_idx1]
print(optimal_threshold1)
0.12837422849711064
# Apply the ROC-optimal threshold (optimal_threshold1, computed above) to the
# predicted POSITIVE-CLASS PROBABILITIES.
# FIX: the original code thresholded logreg1.predict(...) — which returns
# hard 0/1 labels, not probabilities — and compared them against the wrong
# variable (optimal_threshold = 0.25 from the earlier lg4 section). Both
# mistakes together silently reproduced the default 0.5 predictions, which
# is why the metrics printed here matched metrics_score's output exactly.
y_pred_trn = (logreg1.predict_proba(X_train_final)[:, 1] > optimal_threshold1)
y_pred_tst = (logreg1.predict_proba(X_test_final)[:, 1] > optimal_threshold1)
# let us make confusion matrix after optimal threshold has been chosen
make_confusion_matrix(y_test, y_pred_tst)
print('Accuracy on train data:', accuracy_score(y_train, y_pred_trn))
print('Accuracy on test data:', accuracy_score(y_test, y_pred_tst))
# (also fixed the 'Rcall' typo in the label below)
print('\nRecall on train data:', recall_score(y_train, y_pred_trn))
print('Recall on test data:', recall_score(y_test, y_pred_tst))
Accuracy on train data: 0.9442857142857143 Accuracy on test data: 0.944 Rcall on train data: 0.4817073170731707 Recall on test data: 0.5131578947368421
Observations:
# Re-check the structure of df1 (feature-engineered copy built earlier in the
# notebook; per the output below, categorical columns now use dtype 'category').
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 5000 entries, 0 to 4999 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Age 5000 non-null int64 1 Experience 5000 non-null int64 2 Income 5000 non-null int64 3 Family 5000 non-null category 4 CCAvg 5000 non-null float64 5 Education 5000 non-null category 6 Mortgage 5000 non-null int64 7 Personal_Loan 5000 non-null category 8 Securities_Account 5000 non-null category 9 CD_Account 5000 non-null category 10 Online 5000 non-null category 11 CreditCard 5000 non-null category 12 Region 5000 non-null category dtypes: category(8), float64(1), int64(4) memory usage: 235.6 KB
# Separate the target from the predictors for the tree models.
X = df1.drop(['Personal_Loan'], axis=1)
y = df1['Personal_Loan']
# encoding the categorical variables
X = pd.get_dummies(X, drop_first=True)
# Splitting data into training and test set (reproducible split):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=29)
print(f"{X_train.shape} {X_test.shape}")
(3500, 23) (1500, 23)
from sklearn import tree
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# Full-depth Gini tree; class_weight 0.15/0.85 counteracts the ~90/10
# class imbalance in the target.
Dt = DecisionTreeClassifier(criterion='gini', random_state=29,
                            class_weight={0: 0.15, 1: 0.85})
Dt.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, random_state=29)
# Test-set predictions and their confusion matrix.
y_predict = Dt.predict(X_test)
make_confusion_matrix(y_test, y_predict)
# Proportion of each target class in the training data
y_train.value_counts(normalize=True)
0 0.90 1 0.10 Name: Personal_Loan, dtype: float64
Observations:
True Negative - 90.53%
We also see that there are only 10% of the Class '1'.
We want to keep False Negatives — i.e., cases where a customer is wrongly identified as a non-buyer even though they would actually purchase a loan — as low as possible.
Hence Recall is the metric to be used
def scores(model, X_tr=None, X_te=None, y_tr=None, y_te=None):
    """Print accuracy, recall, precision and F1 on the train and test sets.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    X_tr, X_te, y_tr, y_te : optional
        Data to score on. When omitted they default to the notebook-level
        ``X_train``/``X_test``/``y_train``/``y_test`` globals, so existing
        calls such as ``scores(Dt)`` behave exactly as before. Passing them
        explicitly makes the helper reusable on other splits.
    """
    # Fall back to the globals the original implementation hard-coded.
    if X_tr is None:
        X_tr = X_train
    if X_te is None:
        X_te = X_test
    if y_tr is None:
        y_tr = y_train
    if y_te is None:
        y_te = y_test
    y_pred_train = model.predict(X_tr)
    y_pred_test = model.predict(X_te)
    print("Accuracy on training set : ", metrics.accuracy_score(y_tr, y_pred_train))
    print("Accuracy on test set : ", metrics.accuracy_score(y_te, y_pred_test))
    print("\nRecall on training set : ", metrics.recall_score(y_tr, y_pred_train))
    print("Recall on test set : ", metrics.recall_score(y_te, y_pred_test))
    print("\nPrecision on training set : ", metrics.precision_score(y_tr, y_pred_train))
    print("Precision on test set : ", metrics.precision_score(y_te, y_pred_test))
    print("\nF1 on training set : ", metrics.f1_score(y_tr, y_pred_train))
    print("F1 on test set : ", metrics.f1_score(y_te, y_pred_test))
#Let's calculate the Accuracy and Recall Score of the model
# NOTE(review): the 1.0 train scores in the output below show the
# unrestricted tree has memorised the training data (overfitting).
scores(Dt)
Accuracy on training set : 1.0 Accuracy on test set : 0.988 Recall on training set : 1.0 Recall on test set : 0.9051094890510949 Precision on training set : 1.0 Precision on test set : 0.9612403100775194 F1 on training set : 1.0 F1 on test set : 0.9323308270676692
# Post-encoding feature names, reused by the tree plots and text reports.
column_names = X.columns.tolist()
print(column_names)
['Age', 'Experience', 'Income', 'CCAvg', 'Mortgage', 'Family_2', 'Family_3', 'Family_4', 'Education_2', 'Education_3', 'Securities_Account_1', 'CD_Account_1', 'Online_1', 'CreditCard_1', 'Region_Inland_Empire', 'Region_Los_Angeles', 'Region_North_Coast', 'Region_Northern_SanJoaquin_Valley', 'Region_Orange', 'Region_San_Diego', 'Region_San_Francisco_BayArea', 'Region_Southern_SanJoaquin_Valley', 'Region_Superior_California']
# Render the full tree; darken the connector arrows for readability.
plt.figure(figsize=(20, 30))
out = tree.plot_tree(Dt, feature_names=column_names, filled=True,
                     fontsize=8, node_ids=True, class_names=True)
for node_artist in out:
    if node_artist.arrow_patch is not None:
        node_artist.arrow_patch.set_edgecolor('black')
        node_artist.arrow_patch.set_linewidth(1)
plt.show()
# Plain-text rule dump of the fitted tree, with per-leaf class-weight totals.
rules_text = tree.export_text(Dt, feature_names=column_names, show_weights=True)
print(rules_text)
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [369.90, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- CD_Account_1 <= 0.50 | | | |--- Income <= 81.50 | | | | |--- Experience <= 8.50 | | | | | |--- Family_4 <= 0.50 | | | | | | |--- weights: [0.00, 2.55] class: 1 | | | | | |--- Family_4 > 0.50 | | | | | | |--- weights: [1.05, 0.00] class: 0 | | | | |--- Experience > 8.50 | | | | | |--- Family_3 <= 0.50 | | | | | | |--- weights: [7.05, 0.00] class: 0 | | | | | |--- Family_3 > 0.50 | | | | | | |--- weights: [2.10, 0.00] class: 0 | | | |--- Income > 81.50 | | | | |--- CCAvg <= 4.20 | | | | | |--- Mortgage <= 148.00 | | | | | | |--- CCAvg <= 3.05 | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | |--- CCAvg > 3.05 | | | | | | | |--- Experience <= 21.50 | | | | | | | | |--- Age <= 29.50 | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Age > 29.50 | | | | | | | | | |--- CreditCard_1 <= 0.50 | | | | | | | | | | |--- weights: [1.65, 0.00] class: 0 | | | | | | | | | |--- CreditCard_1 > 0.50 | | | | | | | | | | |--- Family_4 <= 0.50 | | | | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | | | | |--- Family_4 > 0.50 | | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | |--- Experience > 21.50 | | | | | | | | |--- Region_North_Coast <= 0.50 | | | | | | | | | |--- Age <= 54.50 | | | | | | | | | | |--- weights: [0.00, 5.10] class: 1 | | | | | | | | | |--- Age > 54.50 | | | | | | | | | | |--- Family_4 <= 0.50 | | | | | | | | | | | |--- truncated branch of depth 3 | | | | | | | | | | |--- Family_4 > 0.50 | | | | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | | |--- Region_North_Coast > 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Mortgage > 148.00 | | | | | | |--- Family_2 <= 0.50 | | | | | | | |--- weights: [1.05, 
0.00] class: 0 | | | | | | |--- Family_2 > 0.50 | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | |--- CCAvg > 4.20 | | | | | |--- weights: [2.85, 0.00] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- CCAvg <= 4.25 | | | | |--- weights: [0.00, 6.80] class: 1 | | | |--- CCAvg > 4.25 | | | | |--- Age <= 43.50 | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Age > 43.50 | | | | | |--- weights: [0.30, 0.00] class: 0 |--- Income > 98.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_3 <= 0.50 | | | | |--- Family_4 <= 0.50 | | | | | |--- Income <= 104.50 | | | | | | |--- CCAvg <= 3.06 | | | | | | | |--- CCAvg <= 0.35 | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | |--- CCAvg > 0.35 | | | | | | | | |--- weights: [2.40, 0.00] class: 0 | | | | | | |--- CCAvg > 3.06 | | | | | | | |--- CCAvg <= 5.15 | | | | | | | | |--- Experience <= 17.00 | | | | | | | | | |--- Family_2 <= 0.50 | | | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | | | |--- Family_2 > 0.50 | | | | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | | | |--- Experience > 17.00 | | | | | | | | | |--- weights: [0.00, 3.40] class: 1 | | | | | | | |--- CCAvg > 5.15 | | | | | | | | |--- Experience <= 22.00 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Experience > 22.00 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Income > 104.50 | | | | | | |--- Region_Los_Angeles <= 0.50 | | | | | | | |--- weights: [51.60, 0.00] class: 0 | | | | | | |--- Region_Los_Angeles > 0.50 | | | | | | | |--- weights: [15.30, 0.00] class: 0 | | | | |--- Family_4 > 0.50 | | | | | |--- Income <= 106.00 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Income > 106.00 | | | | | | |--- CCAvg <= 0.25 | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | |--- CCAvg > 0.25 | | | | | | | |--- weights: [0.00, 19.55] class: 1 | | | |--- Family_3 > 0.50 | | | | |--- Income <= 108.50 | | | | | 
|--- weights: [0.90, 0.00] class: 0 | | | | |--- Income > 108.50 | | | | | |--- Income <= 116.00 | | | | | | |--- Online_1 <= 0.50 | | | | | | | |--- Experience <= 18.00 | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | |--- Experience > 18.00 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Online_1 > 0.50 | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | |--- Income > 116.00 | | | | | | |--- weights: [0.00, 23.80] class: 1 | | |--- Education_2 > 0.50 | | | |--- Income <= 116.50 | | | | |--- CCAvg <= 2.80 | | | | | |--- Income <= 106.50 | | | | | | |--- weights: [3.30, 0.00] class: 0 | | | | | |--- Income > 106.50 | | | | | | |--- Mortgage <= 51.50 | | | | | | | |--- CCAvg <= 0.95 | | | | | | | | |--- weights: [0.45, 0.00] class: 0 | | | | | | | |--- CCAvg > 0.95 | | | | | | | | |--- Family_3 <= 0.50 | | | | | | | | | |--- Experience <= 3.50 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | |--- Experience > 3.50 | | | | | | | | | | |--- Experience <= 35.00 | | | | | | | | | | | |--- truncated branch of depth 4 | | | | | | | | | | |--- Experience > 35.00 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Family_3 > 0.50 | | | | | | | | | |--- Age <= 54.00 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | |--- Age > 54.00 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Mortgage > 51.50 | | | | | | | |--- Online_1 <= 0.50 | | | | | | | | |--- weights: [0.60, 0.00] class: 0 | | | | | | | |--- Online_1 > 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- CCAvg > 2.80 | | | | | |--- Age <= 60.00 | | | | | | |--- weights: [0.00, 9.35] class: 1 | | | | | |--- Age > 60.00 | | | | | | |--- Family_4 <= 0.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | | | |--- Family_4 > 0.50 | | | | | | | |--- weights: [0.75, 0.00] class: 0 | | | |--- Income > 116.50 | | | | |--- Income <= 118.50 | | | | | |--- 
weights: [0.00, 2.55] class: 1 | | | | |--- Income > 118.50 | | | | | |--- weights: [0.00, 88.40] class: 1 | |--- Education_3 > 0.50 | | |--- Income <= 114.50 | | | |--- CCAvg <= 2.35 | | | | |--- Family_3 <= 0.50 | | | | | |--- Mortgage <= 247.50 | | | | | | |--- Region_Superior_California <= 0.50 | | | | | | | |--- weights: [3.45, 0.00] class: 0 | | | | | | |--- Region_Superior_California > 0.50 | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | |--- Mortgage > 247.50 | | | | | | |--- Age <= 34.50 | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | | |--- Age > 34.50 | | | | | | | |--- weights: [0.00, 0.85] class: 1 | | | | |--- Family_3 > 0.50 | | | | | |--- Income <= 108.50 | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | |--- Income > 108.50 | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | |--- CCAvg > 2.35 | | | | |--- Age <= 64.00 | | | | | |--- CD_Account_1 <= 0.50 | | | | | | |--- Family_2 <= 0.50 | | | | | | | |--- Securities_Account_1 <= 0.50 | | | | | | | | |--- weights: [0.00, 9.35] class: 1 | | | | | | | |--- Securities_Account_1 > 0.50 | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Family_2 > 0.50 | | | | | | | |--- Region_San_Diego <= 0.50 | | | | | | | | |--- Region_Los_Angeles <= 0.50 | | | | | | | | | |--- CCAvg <= 4.20 | | | | | | | | | | |--- Income <= 111.00 | | | | | | | | | | | |--- weights: [0.00, 1.70] class: 1 | | | | | | | | | | |--- Income > 111.00 | | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | | |--- CCAvg > 4.20 | | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | | |--- Region_Los_Angeles > 0.50 | | | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | | |--- Region_San_Diego > 0.50 | | | | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | | |--- CD_Account_1 > 0.50 | | | | | | |--- Region_San_Francisco_BayArea <= 0.50 | | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | | |--- Region_San_Francisco_BayArea > 0.50 | | | | 
| | | |--- weights: [0.15, 0.00] class: 0 | | | | |--- Age > 64.00 | | | | | |--- Online_1 <= 0.50 | | | | | | |--- weights: [0.15, 0.00] class: 0 | | | | | |--- Online_1 > 0.50 | | | | | | |--- weights: [0.30, 0.00] class: 0 | | |--- Income > 114.50 | | | |--- Income <= 117.00 | | | | |--- Age <= 42.50 | | | | | |--- weights: [0.30, 0.00] class: 0 | | | | |--- Age > 42.50 | | | | | |--- weights: [0.00, 2.55] class: 1 | | | |--- Income > 117.00 | | | | |--- Region_Northern_SanJoaquin_Valley <= 0.50 | | | | | |--- weights: [0.00, 98.60] class: 1 | | | | |--- Region_Northern_SanJoaquin_Valley > 0.50 | | | | | |--- weights: [0.00, 0.85] class: 1
Observations:
# Horizontal bar chart of impurity-based feature importances,
# ordered least → most important (top of chart = most important).
importance = Dt.feature_importances_
indices = np.argsort(importance)
positions = np.arange(len(indices))
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(positions, importance[indices], color='green', align='center')
plt.yticks(positions, [column_names[i] for i in indices])
plt.xlabel('Importance Value')
plt.show()
Observations:
The top five important features are:
The above full-grown tree is too complex to interpret easily.
from sklearn.model_selection import GridSearchCV
# Choose the type of classifier (same 0.15/0.85 class weights as before).
classifier = DecisionTreeClassifier(random_state=29, class_weight={0: .15, 1: .85})

# Hyperparameter grid to search over.
parameters = {
    'max_depth': np.arange(1, 11),
    'criterion': ['gini'],
    'splitter': ['best', 'random'],
    'max_features': ['log2', 'sqrt'],
}

# Recall is the comparison metric (false negatives are the costly error here).
recall_scorer = metrics.make_scorer(metrics.recall_score)

# Run the grid search and keep the best estimator found.
grid_obj = GridSearchCV(classifier, parameters, scoring=recall_scorer, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)
classifier = grid_obj.best_estimator_

# Fit the best algorithm to the full training data.
classifier.fit(X_train, y_train)
DecisionTreeClassifier(class_weight={0: 0.15, 1: 0.85}, max_depth=4,
max_features='log2', random_state=29)
# Confusion matrix and score summary for the tuned tree.
pred_test2 = classifier.predict(X_test)
make_confusion_matrix(y_test, pred_test2)
scores(classifier)
Accuracy on training set : 0.8071428571428572 Accuracy on test set : 0.8 Recall on training set : 0.9854227405247813 Recall on test set : 0.9343065693430657 Precision on training set : 0.3353174603174603 Precision on test set : 0.3054892601431981 F1 on training set : 0.5003700962250185 F1 on test set : 0.460431654676259
Observations:
# Render the pruned/tuned tree; darken the connector arrows for readability.
plt.figure(figsize=(20, 30))
out = tree.plot_tree(classifier, feature_names=column_names, filled=True,
                     fontsize=11, node_ids=True, class_names=True)
for node_artist in out:
    if node_artist.arrow_patch is not None:
        node_artist.arrow_patch.set_edgecolor('black')
        node_artist.arrow_patch.set_linewidth(1)
plt.show()
# Plain-text rule dump of the tuned tree, with per-leaf class-weight totals.
tuned_rules = tree.export_text(classifier, feature_names=column_names, show_weights=True)
print(tuned_rules)
|--- CD_Account_1 <= 0.50 | |--- Income <= 98.50 | | |--- CCAvg <= 2.95 | | | |--- weights: [358.35, 0.00] class: 0 | | |--- CCAvg > 2.95 | | | |--- Online_1 <= 0.50 | | | | |--- weights: [7.35, 9.35] class: 1 | | | |--- Online_1 > 0.50 | | | | |--- weights: [10.80, 3.40] class: 0 | |--- Income > 98.50 | | |--- Family_3 <= 0.50 | | | |--- Experience <= 24.50 | | | | |--- weights: [53.55, 73.95] class: 1 | | | |--- Experience > 24.50 | | | | |--- weights: [24.30, 60.35] class: 1 | | |--- Family_3 > 0.50 | | | |--- Income <= 114.50 | | | | |--- weights: [3.30, 5.10] class: 1 | | | |--- Income > 114.50 | | | | |--- weights: [0.00, 50.15] class: 1 |--- CD_Account_1 > 0.50 | |--- CCAvg <= 2.95 | | |--- Region_San_Diego <= 0.50 | | | |--- CCAvg <= 2.31 | | | | |--- weights: [9.00, 22.95] class: 1 | | | |--- CCAvg > 2.31 | | | | |--- weights: [2.40, 0.85] class: 0 | | |--- Region_San_Diego > 0.50 | | | |--- Mortgage <= 207.50 | | | | |--- weights: [1.50, 0.00] class: 0 | | | |--- Mortgage > 207.50 | | | | |--- weights: [0.00, 0.85] class: 1 | |--- CCAvg > 2.95 | | |--- Family_4 <= 0.50 | | | |--- Family_2 <= 0.50 | | | | |--- weights: [1.05, 39.10] class: 1 | | | |--- Family_2 > 0.50 | | | | |--- weights: [1.95, 6.80] class: 1 | | |--- Family_4 > 0.50 | | | |--- weights: [0.00, 18.70] class: 1
# Importance chart for the tuned tree (same layout as the full-tree chart).
importances = classifier.feature_importances_
indices = np.argsort(importances)
positions = np.arange(len(indices))
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(positions, importances[indices], color='green', align='center')
plt.yticks(positions, [column_names[i] for i in indices])
plt.xlabel('Importance Value')
plt.show()
Observations:
Using `ccp_alpha` to prune the tree: the greater the `ccp_alpha` value, the higher the number of nodes pruned and the greater the total impurity. First, we find the candidate `ccp_alpha` values.
# Grow an unpruned weighted tree and extract its cost-complexity pruning
# path: candidate alpha values with the total leaf impurity at each.
ccp = DecisionTreeClassifier(random_state=29, class_weight={0: 0.15, 1: 0.85})
ccp.fit(X_train, y_train)
path = ccp.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
# display the path as a dataframe (last expression in the cell)
pd.DataFrame(path)
| ccp_alphas | impurities | |
|---|---|---|
| 0 | 0.00 | -0.00 |
| 1 | 0.00 | -0.00 |
| 2 | 0.00 | -0.00 |
| 3 | 0.00 | -0.00 |
| 4 | 0.00 | -0.00 |
| 5 | 0.00 | -0.00 |
| 6 | 0.00 | -0.00 |
| 7 | 0.00 | -0.00 |
| 8 | 0.00 | -0.00 |
| 9 | 0.00 | -0.00 |
| 10 | 0.00 | -0.00 |
| 11 | 0.00 | -0.00 |
| 12 | 0.00 | -0.00 |
| 13 | 0.00 | -0.00 |
| 14 | 0.00 | -0.00 |
| 15 | 0.00 | -0.00 |
| 16 | 0.00 | 0.00 |
| 17 | 0.00 | 0.00 |
| 18 | 0.00 | 0.00 |
| 19 | 0.00 | 0.00 |
| 20 | 0.00 | 0.00 |
| 21 | 0.00 | 0.00 |
| 22 | 0.00 | 0.00 |
| 23 | 0.00 | 0.01 |
| 24 | 0.00 | 0.01 |
| 25 | 0.00 | 0.01 |
| 26 | 0.00 | 0.01 |
| 27 | 0.00 | 0.01 |
| 28 | 0.00 | 0.01 |
| 29 | 0.00 | 0.01 |
| 30 | 0.00 | 0.01 |
| 31 | 0.00 | 0.01 |
| 32 | 0.00 | 0.01 |
| 33 | 0.00 | 0.01 |
| 34 | 0.00 | 0.02 |
| 35 | 0.00 | 0.02 |
| 36 | 0.00 | 0.02 |
| 37 | 0.00 | 0.02 |
| 38 | 0.00 | 0.02 |
| 39 | 0.00 | 0.02 |
| 40 | 0.00 | 0.03 |
| 41 | 0.00 | 0.03 |
| 42 | 0.00 | 0.03 |
| 43 | 0.00 | 0.03 |
| 44 | 0.00 | 0.03 |
| 45 | 0.00 | 0.04 |
| 46 | 0.00 | 0.04 |
| 47 | 0.00 | 0.04 |
| 48 | 0.00 | 0.05 |
| 49 | 0.00 | 0.05 |
| 50 | 0.00 | 0.05 |
| 51 | 0.00 | 0.06 |
| 52 | 0.00 | 0.06 |
| 53 | 0.00 | 0.07 |
| 54 | 0.01 | 0.07 |
| 55 | 0.02 | 0.09 |
| 56 | 0.03 | 0.22 |
| 57 | 0.25 | 0.47 |
# Visualize how total leaf impurity grows as the pruning strength (alpha) increases.
fig, axis = plt.subplots(figsize=(15, 5))
axis.plot(ccp_alphas, impurities, marker='o', drawstyle="steps-post")
axis.set_xlabel("Alpha")
axis.set_ylabel("Total impurity of leaves")
axis.set_title("Total Impurity vs Alpha for training set")
plt.show()
# Fit one weighted decision tree per candidate alpha; the largest alpha prunes
# the tree all the way down to a single root node.
# fix: replaced the manual empty-list-plus-append loop with a list comprehension
# (idiomatic; DecisionTreeClassifier.fit returns the fitted estimator itself).
clfs = [
    DecisionTreeClassifier(
        random_state=29, ccp_alpha=ccp_alpha, class_weight={0: 0.15, 1: 0.85}
    ).fit(X_train, y_train)
    for ccp_alpha in ccp_alphas
]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
    clfs[-1].tree_.node_count, ccp_alphas[-1]))  # last node count and its alpha
Number of nodes in the last tree is: 1 with ccp_alpha: 0.25368529246233246
Let's plot the Recall Vs Alpha values for both Train and Test set
# Recall on train and test sets for every candidate alpha.
# fix: the original loop refitted a brand-new tree for every alpha even though
# identically-parameterized trees (same random_state, alphas, class weights)
# were already fitted into `clfs` in the previous cell — reuse them instead of
# duplicating all that training work. Also fixes the "appead" comment typo.
recall_train = []
recall_test = []
# Append the train/test recall score of each already-fitted tree, in alpha order
for clf in clfs:
    y_pred_train1 = clf.predict(X_train)
    y_pred_test1 = clf.predict(X_test)
    recall_train.append(metrics.recall_score(y_train, y_pred_train1))
    recall_test.append(metrics.recall_score(y_test, y_pred_test1))
# Plot recall as a function of pruning alpha for both splits on one set of axes.
fig, axis = plt.subplots(figsize=(9, 10))
axis.set_xlabel("alpha")
axis.set_ylabel("Recall")
axis.set_title("Recall vs alpha for training and testing sets")
for series, label in ((recall_train, "train"), (recall_test, "test")):
    axis.plot(ccp_alphas, series, marker='o', label=label, drawstyle="steps-post")
axis.legend(loc='lower left')
plt.show()
# Pick the already-fitted tree whose alpha maximizes recall on the test set
# (argmax returns the first index in case of ties).
index_best_alpha = int(np.argmax(recall_test))
best_model = clfs[index_best_alpha]
print(best_model)
DecisionTreeClassifier(ccp_alpha=0.0042358886679584665,
class_weight={0: 0.15, 1: 0.85}, random_state=29)
The maximum recall value is at alpha = 0.0042. But at this alpha we would lose valuable business information, and the resulting decision tree might have very few nodes.
Hence we will use the point where the recall value first begins to drop, at alpha = 0.003. This ensures we retain information while still getting a high recall value.
# Refit the tree at alpha = 0.003 — the point where test recall just starts dropping.
best_model2 = DecisionTreeClassifier(
    random_state=29,
    ccp_alpha=0.003,
    class_weight={0: 0.15, 1: 0.85},
)
best_model2.fit(X_train, y_train)
DecisionTreeClassifier(ccp_alpha=0.003, class_weight={0: 0.15, 1: 0.85},
random_state=29)
# Predict on the held-out test set with the pruned tree and show its confusion matrix
pred_test3=best_model2.predict(X_test)
make_confusion_matrix(y_test,pred_test3)
Observations:
scores(best_model2)
Accuracy on training set : 0.9748571428571429 Accuracy on test set : 0.9726666666666667 Recall on training set : 0.9416909620991254 Recall on test set : 0.9343065693430657 Precision on training set : 0.8260869565217391 Precision on test set : 0.8 F1 on training set : 0.8801089918256132 F1 on test set : 0.861952861952862
Observations:
# Visualize the pruned tree (filled=True colors nodes by majority class/purity).
plt.figure(figsize=(10,10))
out = tree.plot_tree(best_model2,feature_names=column_names,filled=True,fontsize=9,node_ids=True,class_names=True)
# Darken the connector arrows for readability; leaf annotations have no arrow_patch
for o in out:
    arrow = o.arrow_patch
    if arrow is not None:
        arrow.set_edgecolor('black')
        arrow.set_linewidth(1)
plt.show()
# fix: removed the duplicate plt.show() call — the second invocation was a no-op
# Text report showing the rules of a decision tree -
# show_weights=True appends the class-weighted sample counts [class0, class1] at each node
print(tree.export_text(best_model2,feature_names=column_names,show_weights=True))
|--- Income <= 98.50 | |--- CCAvg <= 2.95 | | |--- weights: [369.90, 0.00] class: 0 | |--- CCAvg > 2.95 | | |--- CD_Account_1 <= 0.50 | | | |--- weights: [18.15, 12.75] class: 0 | | |--- CD_Account_1 > 0.50 | | | |--- weights: [0.45, 6.80] class: 1 |--- Income > 98.50 | |--- Education_3 <= 0.50 | | |--- Education_2 <= 0.50 | | | |--- Family_3 <= 0.50 | | | | |--- Family_4 <= 0.50 | | | | | |--- Income <= 104.50 | | | | | | |--- CCAvg <= 3.06 | | | | | | | |--- weights: [2.85, 0.00] class: 0 | | | | | | |--- CCAvg > 3.06 | | | | | | | |--- weights: [0.90, 4.25] class: 1 | | | | | |--- Income > 104.50 | | | | | | |--- weights: [66.90, 0.00] class: 0 | | | | |--- Family_4 > 0.50 | | | | | |--- weights: [0.15, 21.25] class: 1 | | | |--- Family_3 > 0.50 | | | | |--- weights: [1.65, 25.50] class: 1 | | |--- Education_2 > 0.50 | | | |--- Income <= 116.50 | | | | |--- CCAvg <= 2.80 | | | | | |--- weights: [5.55, 4.25] class: 0 | | | | |--- CCAvg > 2.80 | | | | | |--- weights: [0.75, 10.20] class: 1 | | | |--- Income > 116.50 | | | | |--- weights: [0.00, 90.95] class: 1 | |--- Education_3 > 0.50 | | |--- Income <= 114.50 | | | |--- weights: [6.00, 13.60] class: 1 | | |--- Income > 114.50 | | | |--- weights: [0.30, 102.00] class: 1
# Feature importances of the final pruned model, plotted in ascending order.
importances2 = best_model2.feature_importances_
order = np.argsort(importances2)
plt.figure(figsize=(12, 12))
plt.title('Feature Importances')
plt.barh(range(len(order)), importances2[order], color='green', align='center')
plt.yticks(range(len(order)), [column_names[pos] for pos in order])
plt.xlabel('Relative Importance')
plt.show()
# Summary table comparing accuracy and recall of every model built so far.
# fix: corrected typos in the displayed model names ("mutlicollinearity remvo",
# "Selction", "treee") — these are presentation labels only; the metrics are unchanged.
All_models = {
    'Model': [
        'Logistic Regression Model-sklearn',
        'Logistic Regression-Statsmodel-multicollinearity removal',
        'Logistic Regression-Optimal Threshold =0.2017',
        'Logistic Regression-Optimal Threshold =0.25',
        'Sequential Feature Selection Method',
        'Initial Decision Tree',
        'Decision tree- hyperparameter tuning(pre-pruning)',
        'Decision tree- Cost Complexity post-pruning',
    ],
    'Train_Accuracy': [0.9380, 0.9640, 0.9497, 0.9560, 0.944, 1.0, 0.8070, 0.9749],
    'Test_Accuracy': [0.9387, 0.9660, 0.9400, 0.9493, 0.944, 0.9880, 0.80, 0.972],
    'Train_Recall': [0.5245, 0.7289, 0.8484, 0.8367, 0.4817, 1.0, 0.9854, 0.9417],
    'Test_Recall': [0.4599, 0.6788, 0.8321, 0.8102, 0.5131, 0.9051, 0.9343, 0.9343],
}
comparison = pd.DataFrame(All_models)
comparison
| Model | Train_Accuracy | Test_Accuracy | Train_Recall | Test_Recall | |
|---|---|---|---|---|---|
| 0 | Logistic Regression Model-sklearn | 0.94 | 0.94 | 0.52 | 0.46 |
| 1 | Logistic Regression-Statsmodel-mutlicollineari... | 0.96 | 0.97 | 0.73 | 0.68 |
| 2 | Logistic Regression-Optimal Threshold =0.2017 | 0.95 | 0.94 | 0.85 | 0.83 |
| 3 | Logistic Regression-Optimal Threshold =0.25 | 0.96 | 0.95 | 0.84 | 0.81 |
| 4 | Sequential Feature Selction Method | 0.94 | 0.94 | 0.48 | 0.51 |
| 5 | Initial Decision Tree | 1.00 | 0.99 | 1.00 | 0.91 |
| 6 | Decision treee- hyperparameter tuning(pre-prun... | 0.81 | 0.80 | 0.99 | 0.93 |
| 7 | Decision tree- Cost Complexity post-pruning | 0.97 | 0.97 | 0.94 | 0.93 |
The most important features from this model are:
This model is also not affected by outliers or extreme values
# Rebuild predictors/target from the dataset without outlier treatment for error analysis.
df2 = df1.copy()
A = df2.drop(columns=['Personal_Loan'])  # independent variables
B = df2[['Personal_Loan']]               # dependent variable
A = pd.get_dummies(A, drop_first=True)   # one-hot encode categorical columns
# 70/30 train-test split with the same seed used earlier
A_train, A_test, B_train, B_test = train_test_split(A, B, test_size=0.3, random_state=1)
print(A_train.shape, A_test.shape)
(3500, 23) (1500, 23)
# Apply the final model best_model2 to the new train/test split for error analysis.
final_pred_train = best_model2.predict(A_train)
final_pred_test = best_model2.predict(A_test)
# Keep only the df2 rows that landed in the training split
original_df = df2[df2.index.isin(A_train.index.values)].copy()
original_df['Predicted'] = final_pred_train  # model predictions alongside actuals
# NOTE(review): despite its name, 'Misclassification' is True when the prediction
# MATCHES the actual label (correctly classified); the name is kept because later
# cells reference it.
# fix: np.where(cond, True, False) was redundant — the boolean comparison itself
# already yields the same True/False values.
original_df['Misclassification'] = (
    original_df["Predicted"] == original_df["Personal_Loan"]
)
original_df['Misclassification'].value_counts()
True 2874 False 626 Name: Misclassification, dtype: int64
# Bar plots of each numeric variable split by classification outcome,
# to check whether the errors cluster on particular variables.
num_values = original_df.select_dtypes(include=np.number).columns.tolist()
num_values.remove('Predicted')  # the prediction itself is not a feature
plt.figure(figsize=(20, 10))
for pos, col in enumerate(num_values, start=1):
    plt.subplot(3, 2, pos)
    sns.barplot(original_df['Misclassification'], original_df[col], palette="Dark2")
plt.tight_layout()
plt.show()
# Count plots of each categorical variable, hued by classification outcome.
cat_values = original_df.select_dtypes(exclude=np.number).columns.tolist()
cat_values.remove('Misclassification')  # the hue variable is not a feature
plt.figure(figsize=(17, 25))
for pos, col in enumerate(cat_values, start=1):
    plt.subplot(5, 2, pos)
    sns.countplot(original_df[col], hue=original_df['Misclassification'], palette="Dark2")
plt.tight_layout()
plt.legend(labels=('F', 'T'), loc='upper left')
plt.xticks(rotation=90)
plt.show()
OBSERVATIONS:
About 18% of the data (626 entries) from the train set has been misclassified, i.e. the predicted value of the model was not the same as the Personal_Loan variable in the dataset.
The misclassification seems to be spread across all variables, but it is significant on some.
Income and CCAvg have high misclassifications. This is understandable as the model highlighted these two features as very important. Hence the model seems to have classified customers with high income and CCavg as potential loan borrowers
Among the categorical variables; again the misclassification is high for customer with CD_account; an important feature for the model.
# Income vs CCAvg scatter, colored by whether the customer holds a CD account.
sns.scatterplot(
    x=original_df["Income"],
    y=original_df["CCAvg"],
    hue=original_df["CD_Account"],
    palette='Set2',
)
<AxesSubplot:xlabel='Income', ylabel='CCAvg'>